{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gym\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# Tabular Q-learning on FrozenLake: build the environment, then learn q_table.\n",
    "# No render_mode is passed here, so nothing is drawn during training.\n",
    "env = gym.make('FrozenLake-v1')\n",
    "n_states = env.observation_space.n\n",
    "n_actions = env.action_space.n\n",
    "\n",
    "# Q-learning hyperparameters\n",
    "alpha = 0.85  # learning rate\n",
    "gamma = 0.99  # discount factor\n",
    "\n",
    "# Q-table: one row per state, one column per action, initialised to zero\n",
    "q_table = np.zeros([n_states, n_actions])\n",
    "\n",
    "# Training schedule\n",
    "num_episodes = 10000\n",
    "max_steps_per_episode = 200\n",
    "log_every = 1000  # print one episode out of this many instead of one line per episode\n",
    "\n",
    "for episode in range(num_episodes):\n",
    "    state, init_info = env.reset()\n",
    "    total_reward = 0\n",
    "    for step in range(max_steps_per_episode):\n",
    "        # Greedy choice over noise-perturbed Q-values. The Gaussian noise is scaled by\n",
    "        # 1 / (episode + 1), so exploration fades as training progresses. The noise vector\n",
    "        # is sized from the action space rather than a hard-coded 4.\n",
    "        noise = np.random.randn(n_actions) * (1 / (episode + 1))\n",
    "        action = np.argmax(q_table[state, :] + noise)\n",
    "\n",
    "        # Step the environment; the 5-tuple API reports termination and truncation separately\n",
    "        new_state, reward, terminated, truncated, info = env.step(action)\n",
    "\n",
    "        # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').\n",
    "        # Rows are only written for states an action was taken from, and the loop leaves\n",
    "        # as soon as the episode terminates, so rows of terminal states stay at zero.\n",
    "        target = reward + gamma * np.max(q_table[new_state, :])\n",
    "        q_table[state, action] += alpha * (target - q_table[state, action])\n",
    "\n",
    "        state = new_state\n",
    "        total_reward += reward\n",
    "\n",
    "        # Leave on either flag: once an episode has terminated or been truncated\n",
    "        # (e.g. by a time limit), the environment must be reset before stepping again.\n",
    "        if terminated or truncated:\n",
    "            break\n",
    "\n",
    "    if (episode + 1) % log_every == 0:\n",
    "        print(f\"Episode {episode + 1}, Total Reward: {total_reward}\")\n",
    "\n",
    "# Training is finished; close this environment before `env` is rebound elsewhere\n",
    "env.close()"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ccea877542e21009",
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# Play a few rendered episodes with the learned Q-values.\n",
    "# Relies on q_table and max_steps_per_episode defined by the training cell.\n",
    "env = gym.make('FrozenLake-v1', render_mode=\"human\")\n",
    "num_episodes_play = 10\n",
    "\n",
    "for _ in range(num_episodes_play):\n",
    "    state, init_info = env.reset()\n",
    "    total_reward = 0\n",
    "\n",
    "    for step in range(max_steps_per_episode):\n",
    "        # Pure greedy policy: take the action with the highest learned Q-value\n",
    "        action = np.argmax(q_table[state, :])\n",
    "\n",
    "        # Step the environment; termination and truncation are reported separately\n",
    "        new_state, reward, terminated, truncated, info = env.step(action)\n",
    "        state = new_state\n",
    "        total_reward += reward\n",
    "\n",
    "        # Leave on either flag: a greedy policy that never reaches a terminal state\n",
    "        # would otherwise keep stepping after the episode has been truncated.\n",
    "        if terminated or truncated:\n",
    "            break\n",
    "\n",
    "    print(f\"Playing Episode, Total Reward: {total_reward}\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "a20625f125e85853",
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "env.close()  # Close the environment"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "5329466b7f8231a8",
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
